{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SHARE Data in the Wide World\n",
    "\n",
    "This notebook will focus on how to export SHARE data into different formats, and how to query SHARE for specific information from your institution, say from a list of names or from a list of emails or ORCIDs that act as reseearcher identifiers.\n",
    "\n",
    "\n",
    "## Exporting a DataFrame to csv and Excel\n",
    "\n",
    "When doing an aggregation on SHARE data, it might be beneficial to export the data to a format that is easier to widely distribute, such as a csv file or and Excel file.\n",
    "\n",
    "First, we'll do a SHARE aggregation query for documents from each source that have a description, turn it into a pandas DataFrame, and export the data into both csv and Excel formats."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "from sharepa import ShareSearch\n",
    "from sharepa.helpers import pretty_print\n",
    "\n",
    "description_search = ShareSearch()\n",
    "\n",
    "description_search = description_search.query(\n",
    "    'query_string', # Type of query, will accept a lucene query string\n",
    "    query='description:*', # This lucene query string will find all documents that don't have a description\n",
    "    analyze_wildcard=True  # This will make elasticsearch pay attention to the asterisk (which matches anything)\n",
    ")\n",
    "\n",
    "description_search.aggs.bucket(\n",
    "    'sources',  # Every aggregation needs a name\n",
    "    'significant_terms',  # There are many kinds of aggregations\n",
    "    field='_type',  # We store the source of a document in its type, so this will aggregate by source\n",
    "    min_doc_count=0,\n",
    "    percentage={}, # Will make the score value the percentage of all results (doc_count/bg_count)\n",
    "    size=0\n",
    ")\n",
    "\n",
    "description_results = description_search.execute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>bg_count</th>\n",
       "      <th>doc_count</th>\n",
       "      <th>score</th>\n",
       "      <th>percent</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>key</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>ut_chattanooga</th>\n",
       "      <td>270</td>\n",
       "      <td>270</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>100.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>purdue</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>100.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>nist</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>100.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>figshare</th>\n",
       "      <td>295274</td>\n",
       "      <td>285986</td>\n",
       "      <td>0.968544</td>\n",
       "      <td>96.854447</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>addis_ababa</th>\n",
       "      <td>1940</td>\n",
       "      <td>1878</td>\n",
       "      <td>0.968041</td>\n",
       "      <td>96.804124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>elife</th>\n",
       "      <td>454</td>\n",
       "      <td>439</td>\n",
       "      <td>0.966960</td>\n",
       "      <td>96.696035</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>oaktrust</th>\n",
       "      <td>1553</td>\n",
       "      <td>1486</td>\n",
       "      <td>0.956858</td>\n",
       "      <td>95.685769</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>columbia</th>\n",
       "      <td>1762</td>\n",
       "      <td>1685</td>\n",
       "      <td>0.956300</td>\n",
       "      <td>95.629966</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>dailyssrn</th>\n",
       "      <td>7157</td>\n",
       "      <td>6844</td>\n",
       "      <td>0.956267</td>\n",
       "      <td>95.626659</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hacettepe</th>\n",
       "      <td>45</td>\n",
       "      <td>43</td>\n",
       "      <td>0.955556</td>\n",
       "      <td>95.555556</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>calhoun</th>\n",
       "      <td>4333</td>\n",
       "      <td>4085</td>\n",
       "      <td>0.942765</td>\n",
       "      <td>94.276483</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>krex</th>\n",
       "      <td>15</td>\n",
       "      <td>14</td>\n",
       "      <td>0.933333</td>\n",
       "      <td>93.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>citeseerx</th>\n",
       "      <td>184037</td>\n",
       "      <td>170545</td>\n",
       "      <td>0.926689</td>\n",
       "      <td>92.668865</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>scholarsbank</th>\n",
       "      <td>11477</td>\n",
       "      <td>10600</td>\n",
       "      <td>0.923586</td>\n",
       "      <td>92.358630</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mason</th>\n",
       "      <td>13</td>\n",
       "      <td>12</td>\n",
       "      <td>0.923077</td>\n",
       "      <td>92.307692</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>asu</th>\n",
       "      <td>13758</td>\n",
       "      <td>12573</td>\n",
       "      <td>0.913868</td>\n",
       "      <td>91.386829</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pcurio</th>\n",
       "      <td>13101</td>\n",
       "      <td>11898</td>\n",
       "      <td>0.908175</td>\n",
       "      <td>90.817495</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>scholarsarchiveosu</th>\n",
       "      <td>1940</td>\n",
       "      <td>1757</td>\n",
       "      <td>0.905670</td>\n",
       "      <td>90.567010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>harvarddataverse</th>\n",
       "      <td>1758</td>\n",
       "      <td>1592</td>\n",
       "      <td>0.905575</td>\n",
       "      <td>90.557452</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>dash</th>\n",
       "      <td>4687</td>\n",
       "      <td>4202</td>\n",
       "      <td>0.896522</td>\n",
       "      <td>89.652230</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>plos</th>\n",
       "      <td>42600</td>\n",
       "      <td>37905</td>\n",
       "      <td>0.889789</td>\n",
       "      <td>88.978873</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>springer</th>\n",
       "      <td>23561</td>\n",
       "      <td>20871</td>\n",
       "      <td>0.885828</td>\n",
       "      <td>88.582828</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mblwhoilibrary</th>\n",
       "      <td>516</td>\n",
       "      <td>452</td>\n",
       "      <td>0.875969</td>\n",
       "      <td>87.596899</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zenodo</th>\n",
       "      <td>22624</td>\n",
       "      <td>19678</td>\n",
       "      <td>0.869784</td>\n",
       "      <td>86.978430</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>uiucideals</th>\n",
       "      <td>15478</td>\n",
       "      <td>13269</td>\n",
       "      <td>0.857281</td>\n",
       "      <td>85.728130</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>upennsylvania</th>\n",
       "      <td>3787</td>\n",
       "      <td>3213</td>\n",
       "      <td>0.848429</td>\n",
       "      <td>84.842884</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pdxscholar</th>\n",
       "      <td>1438</td>\n",
       "      <td>1211</td>\n",
       "      <td>0.842142</td>\n",
       "      <td>84.214186</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>iastate</th>\n",
       "      <td>1753</td>\n",
       "      <td>1466</td>\n",
       "      <td>0.836281</td>\n",
       "      <td>83.628066</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>caltech</th>\n",
       "      <td>13961</td>\n",
       "      <td>11642</td>\n",
       "      <td>0.833894</td>\n",
       "      <td>83.389442</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cyberleninka</th>\n",
       "      <td>99303</td>\n",
       "      <td>82691</td>\n",
       "      <td>0.832714</td>\n",
       "      <td>83.271402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>waynestate</th>\n",
       "      <td>576</td>\n",
       "      <td>320</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>55.555556</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pubmedcentral</th>\n",
       "      <td>358339</td>\n",
       "      <td>198295</td>\n",
       "      <td>0.553373</td>\n",
       "      <td>55.337264</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>uwashington</th>\n",
       "      <td>14880</td>\n",
       "      <td>7910</td>\n",
       "      <td>0.531586</td>\n",
       "      <td>53.158602</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>dryad</th>\n",
       "      <td>6824</td>\n",
       "      <td>3412</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>50.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>texasstate</th>\n",
       "      <td>232</td>\n",
       "      <td>104</td>\n",
       "      <td>0.448276</td>\n",
       "      <td>44.827586</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>wash_state_u</th>\n",
       "      <td>9394</td>\n",
       "      <td>4044</td>\n",
       "      <td>0.430488</td>\n",
       "      <td>43.048755</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>smithsonian</th>\n",
       "      <td>7316</td>\n",
       "      <td>3143</td>\n",
       "      <td>0.429606</td>\n",
       "      <td>42.960634</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ucescholarship</th>\n",
       "      <td>34987</td>\n",
       "      <td>14982</td>\n",
       "      <td>0.428216</td>\n",
       "      <td>42.821619</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>scitech</th>\n",
       "      <td>296155</td>\n",
       "      <td>123650</td>\n",
       "      <td>0.417518</td>\n",
       "      <td>41.751785</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>umich</th>\n",
       "      <td>1523</td>\n",
       "      <td>609</td>\n",
       "      <td>0.399869</td>\n",
       "      <td>39.986868</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ghent</th>\n",
       "      <td>6305</td>\n",
       "      <td>2425</td>\n",
       "      <td>0.384615</td>\n",
       "      <td>38.461538</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>shareok</th>\n",
       "      <td>28843</td>\n",
       "      <td>10931</td>\n",
       "      <td>0.378983</td>\n",
       "      <td>37.898277</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>datacite</th>\n",
       "      <td>1356658</td>\n",
       "      <td>494099</td>\n",
       "      <td>0.364203</td>\n",
       "      <td>36.420306</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>valposcholar</th>\n",
       "      <td>291</td>\n",
       "      <td>102</td>\n",
       "      <td>0.350515</td>\n",
       "      <td>35.051546</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>osf</th>\n",
       "      <td>1194</td>\n",
       "      <td>378</td>\n",
       "      <td>0.316583</td>\n",
       "      <td>31.658291</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>iowaresearch</th>\n",
       "      <td>33846</td>\n",
       "      <td>9743</td>\n",
       "      <td>0.287863</td>\n",
       "      <td>28.786267</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>noaa_nodc</th>\n",
       "      <td>10629</td>\n",
       "      <td>3024</td>\n",
       "      <td>0.284505</td>\n",
       "      <td>28.450466</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>icpsr</th>\n",
       "      <td>33267</td>\n",
       "      <td>9446</td>\n",
       "      <td>0.283945</td>\n",
       "      <td>28.394505</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>scholarworks_umass</th>\n",
       "      <td>3610</td>\n",
       "      <td>868</td>\n",
       "      <td>0.240443</td>\n",
       "      <td>24.044321</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>erudit</th>\n",
       "      <td>1433</td>\n",
       "      <td>344</td>\n",
       "      <td>0.240056</td>\n",
       "      <td>24.005583</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>duke</th>\n",
       "      <td>1710</td>\n",
       "      <td>406</td>\n",
       "      <td>0.237427</td>\n",
       "      <td>23.742690</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>dataone</th>\n",
       "      <td>356537</td>\n",
       "      <td>74017</td>\n",
       "      <td>0.207600</td>\n",
       "      <td>20.759977</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>lshtm</th>\n",
       "      <td>99447</td>\n",
       "      <td>20171</td>\n",
       "      <td>0.202832</td>\n",
       "      <td>20.283166</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>doepages</th>\n",
       "      <td>10751</td>\n",
       "      <td>2016</td>\n",
       "      <td>0.187517</td>\n",
       "      <td>18.751744</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>kent</th>\n",
       "      <td>2155</td>\n",
       "      <td>397</td>\n",
       "      <td>0.184223</td>\n",
       "      <td>18.422274</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>lwbin</th>\n",
       "      <td>390</td>\n",
       "      <td>38</td>\n",
       "      <td>0.097436</td>\n",
       "      <td>9.743590</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>iwu_commons</th>\n",
       "      <td>16</td>\n",
       "      <td>1</td>\n",
       "      <td>0.062500</td>\n",
       "      <td>6.250000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>neurovault</th>\n",
       "      <td>3274</td>\n",
       "      <td>145</td>\n",
       "      <td>0.044288</td>\n",
       "      <td>4.428833</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>biomedcentral</th>\n",
       "      <td>24831</td>\n",
       "      <td>1000</td>\n",
       "      <td>0.040272</td>\n",
       "      <td>4.027224</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>crossref</th>\n",
       "      <td>931095</td>\n",
       "      <td>27178</td>\n",
       "      <td>0.029189</td>\n",
       "      <td>2.918929</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>92 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                    bg_count  doc_count     score     percent\n",
       "key                                                          \n",
       "ut_chattanooga           270        270  1.000000  100.000000\n",
       "purdue                     2          2  1.000000  100.000000\n",
       "nist                       3          3  1.000000  100.000000\n",
       "figshare              295274     285986  0.968544   96.854447\n",
       "addis_ababa             1940       1878  0.968041   96.804124\n",
       "elife                    454        439  0.966960   96.696035\n",
       "oaktrust                1553       1486  0.956858   95.685769\n",
       "columbia                1762       1685  0.956300   95.629966\n",
       "dailyssrn               7157       6844  0.956267   95.626659\n",
       "hacettepe                 45         43  0.955556   95.555556\n",
       "calhoun                 4333       4085  0.942765   94.276483\n",
       "krex                      15         14  0.933333   93.333333\n",
       "citeseerx             184037     170545  0.926689   92.668865\n",
       "scholarsbank           11477      10600  0.923586   92.358630\n",
       "mason                     13         12  0.923077   92.307692\n",
       "asu                    13758      12573  0.913868   91.386829\n",
       "pcurio                 13101      11898  0.908175   90.817495\n",
       "scholarsarchiveosu      1940       1757  0.905670   90.567010\n",
       "harvarddataverse        1758       1592  0.905575   90.557452\n",
       "dash                    4687       4202  0.896522   89.652230\n",
       "plos                   42600      37905  0.889789   88.978873\n",
       "springer               23561      20871  0.885828   88.582828\n",
       "mblwhoilibrary           516        452  0.875969   87.596899\n",
       "zenodo                 22624      19678  0.869784   86.978430\n",
       "uiucideals             15478      13269  0.857281   85.728130\n",
       "upennsylvania           3787       3213  0.848429   84.842884\n",
       "pdxscholar              1438       1211  0.842142   84.214186\n",
       "iastate                 1753       1466  0.836281   83.628066\n",
       "caltech                13961      11642  0.833894   83.389442\n",
       "cyberleninka           99303      82691  0.832714   83.271402\n",
       "...                      ...        ...       ...         ...\n",
       "waynestate               576        320  0.555556   55.555556\n",
       "pubmedcentral         358339     198295  0.553373   55.337264\n",
       "uwashington            14880       7910  0.531586   53.158602\n",
       "dryad                   6824       3412  0.500000   50.000000\n",
       "texasstate               232        104  0.448276   44.827586\n",
       "wash_state_u            9394       4044  0.430488   43.048755\n",
       "smithsonian             7316       3143  0.429606   42.960634\n",
       "ucescholarship         34987      14982  0.428216   42.821619\n",
       "scitech               296155     123650  0.417518   41.751785\n",
       "umich                   1523        609  0.399869   39.986868\n",
       "ghent                   6305       2425  0.384615   38.461538\n",
       "shareok                28843      10931  0.378983   37.898277\n",
       "datacite             1356658     494099  0.364203   36.420306\n",
       "valposcholar             291        102  0.350515   35.051546\n",
       "osf                     1194        378  0.316583   31.658291\n",
       "iowaresearch           33846       9743  0.287863   28.786267\n",
       "noaa_nodc              10629       3024  0.284505   28.450466\n",
       "icpsr                  33267       9446  0.283945   28.394505\n",
       "scholarworks_umass      3610        868  0.240443   24.044321\n",
       "erudit                  1433        344  0.240056   24.005583\n",
       "duke                    1710        406  0.237427   23.742690\n",
       "dataone               356537      74017  0.207600   20.759977\n",
       "lshtm                  99447      20171  0.202832   20.283166\n",
       "doepages               10751       2016  0.187517   18.751744\n",
       "kent                    2155        397  0.184223   18.422274\n",
       "lwbin                    390         38  0.097436    9.743590\n",
       "iwu_commons               16          1  0.062500    6.250000\n",
       "neurovault              3274        145  0.044288    4.428833\n",
       "biomedcentral          24831       1000  0.040272    4.027224\n",
       "crossref              931095      27178  0.029189    2.918929\n",
       "\n",
       "[92 rows x 4 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "description_dataframe = pd.DataFrame(description_results.aggregations.sources.to_dict()['buckets'])\n",
    "\n",
    "# We will add our own \"percent\" column to make things clearer\n",
    "description_dataframe['percent'] = (description_dataframe['score'] * 100)\n",
    "\n",
    "# Let's set the source name as the index, and then drop the old column\n",
    "description_dataframe = description_dataframe.set_index(description_dataframe['key'])\n",
    "description_dataframe = description_dataframe.drop('key', 1)\n",
    "\n",
    "# Finally, we'll show the results!\n",
    "description_dataframe"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's export this pandas dataframe to a csv file, and to an excel file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "description_dataframe.to_csv('SHARE_Counts_with_Descriptions.csv')\n",
    "description_dataframe.to_excel('SHARE_Counts_with_Descriptions.xlsx')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Working with outside data\n",
    "\n",
    "Let's say we had a list of names of researchers that were from a particular University. We're interested in seeing if their full names appear in any sources across the SHARE data set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "names = [\"Susan Jones\", \"Ravi Patel\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "There are 32 documents with contributors who have any of those names.\n",
      "Here are the first 10:\n",
      "---------\n",
      "Short- and Long-Term Outcomes for Extremely Preterm Infants -- with contributors Ravi Patel\n",
      "\"Prospective, Randomized, Multi-Center, Efficacy Non-inferiority Study of MEDIHONEY® Gel Versus Collagenase for Wound Debridement\" -- with contributors Ravi Patel, MD\n",
      "Representative structures of bHLH proteins from the Protein Data Bank -- with contributors Susan Jones\n",
      "‘It’s not what it looks like. I’m Santa’: Connecting Community through Film -- with contributors Susan Jones, Joanna McIntyre\n",
      "Evolutionary tree for SMC proteins, created using PHYLIP 69,70 -- with contributors Susan Jones, John Sgouros\n",
      "Structure of an SMC protein -- with contributors Susan Jones, John Sgouros\n",
      "Sequence alignments for three motifs shared by proteins in the cohesion network -- with contributors Susan Jones, John Sgouros\n",
      "Sequence alignment of the conserved motif in Scc2, Chk1 and Pkh1, which includes the PROSITE serine/threonine (S/T) protein kinase motif -- with contributors Susan Jones, John Sgouros\n",
      "The cohesion interaction network -- with contributors Susan Jones, John Sgouros\n",
      "Ultrastructure of the spermatheca in female neotenics, alates and de-alates of the subterranean termite Reticulitermes flavipes (Isoptera: Rhinotermitidae) -- with contributors El-Desouky Ammar, Susan Jones\n"
     ]
    }
   ],
   "source": [
    "name_search = ShareSearch()\n",
    "\n",
    "for name in names:\n",
    "    name_search = name_search.query(\n",
    "        {\n",
    "            \"bool\": {\n",
    "                \"should\": [\n",
    "                    {\n",
    "                        \"match\": {\n",
    "                            \"contributors.name\": {\n",
    "                                \"query\": name, \n",
    "                                \"operator\": \"and\",\n",
    "                                \"type\" : \"phrase\"\n",
    "                            }\n",
    "                        }\n",
    "                    }\n",
    "                ]\n",
    "            }\n",
    "        }\n",
    "    )\n",
    "\n",
    "\n",
    "name_results = name_search.execute()\n",
    "\n",
    "print('There are {} documents with contributors who have any of those names.'.format(name_search.count()))\n",
    "print('Here are the first 10:')\n",
    "print('---------')\n",
    "for result in name_results:\n",
    "    print(\n",
    "        '{} -- with contributors {}'.format(\n",
    "            result.title.encode('utf-8'),\n",
    "            ', '.join([contributor.name.encode('utf-8') for contributor in result.contributors])\n",
    "        )\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If we were interested to see an analysis of what sources these names came from, we can add an aggregation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>doc_count</th>\n",
       "      <th>key</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>11</td>\n",
       "      <td>datacite</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10</td>\n",
       "      <td>crossref</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6</td>\n",
       "      <td>pubmedcentral</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>clinicaltrials</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>arxiv_oai</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1</td>\n",
       "      <td>citeseerx</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   doc_count             key\n",
       "0         11        datacite\n",
       "1         10        crossref\n",
       "2          6   pubmedcentral\n",
       "3          3  clinicaltrials\n",
       "4          1       arxiv_oai\n",
       "5          1       citeseerx"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "name_search.aggs.bucket(\n",
    "    'sources',  # Every aggregation needs a name\n",
    "    'terms',  # There are many kinds of aggregations, terms is a pretty useful one though\n",
    "    field='_type',  # We store the source of a document in its type, so this will aggregate by source\n",
    "    size=0,  # These are just to make sure we get numbers for all the sources, to make it easier to combine graphs\n",
    "    min_doc_count=1\n",
    ")\n",
    "\n",
    "name_results = name_search.execute()\n",
    "\n",
    "pd.DataFrame(name_results.aggregations.sources.to_dict()['buckets'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Say instead of names, which can be a little more arbitrary, we'd like to search by email address or ORCID instead."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "orcids = [\n",
    "    'http://orcid.org/0000-0003-1942-4543',\n",
    "    'http://orcid.org/0000-0003-4875-1447',\n",
    "    'http://orcid.org/0000-0002-6085-4433',\n",
    "    'http://orcid.org/0000-0002-7995-9948',\n",
    "    'http://orcid.org/0000-0002-2170-853X',\n",
    "    'http://orcid.org/0000-0002-8899-9087'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "orcid_search = ShareSearch()\n",
    "\n",
    "for orcid in orcids:\n",
    "    orcid_search = orcid_search.query(\n",
    "        {\n",
    "            \"bool\": {\n",
    "                \"should\": [\n",
    "                    {\n",
    "                        \"match\": {\n",
    "                            \"contributors.sameAs\": {\n",
    "                                \"query\": orcid, \n",
    "                                \"operator\": \"and\",\n",
    "                                \"type\" : \"phrase\"\n",
    "                            }\n",
    "                        }\n",
    "                    }\n",
    "                ]\n",
    "            }\n",
    "        }\n",
    "    )\n",
    "\n",
    "orcid_search.aggs.bucket(\n",
    "    'sources',  # Every aggregation needs a name\n",
    "    'terms',  # There are many kinds of aggregations, terms is a pretty useful one though\n",
    "    field='_type',  # We store the source of a document in its type, so this will aggregate by source\n",
    "    size=0,  # These are just to make sure we get numbers for all the sources, to make it easier to combine graphs\n",
    "    min_doc_count=1\n",
    ")\n",
    "\n",
    "orcid_results = orcid_search.execute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "There are 8 documents with contributors who have any of those orcids.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>docID</th>\n",
       "      <th>source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Widespread shortening of 3' untranslated regio...</td>\n",
       "      <td>10.1101/026831</td>\n",
       "      <td>crossref</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Light-induced indeterminacy alters shade avoid...</td>\n",
       "      <td>10.1101/024018</td>\n",
       "      <td>crossref</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A psychometric analysis of outcome measures in...</td>\n",
       "      <td>10.1136/annrheumdis-2014-207235</td>\n",
       "      <td>crossref</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>COMADRE: a global database of animal demography</td>\n",
       "      <td>10.1101/027821</td>\n",
       "      <td>crossref</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Optimisation of a treat-to-target approach in ...</td>\n",
       "      <td>10.1136/annrheumdis-2015-208324</td>\n",
       "      <td>crossref</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>A selfish genetic element drives recurring sel...</td>\n",
       "      <td>10.1101/024851</td>\n",
       "      <td>crossref</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Pharmacological treatment of psoriatic arthrit...</td>\n",
       "      <td>10.1136/annrheumdis-2015-208466</td>\n",
       "      <td>crossref</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>In wealthier countries, patients perceive wors...</td>\n",
       "      <td>10.1136/annrheumdis-2015-207738</td>\n",
       "      <td>crossref</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               title  \\\n",
       "0  Widespread shortening of 3' untranslated regio...   \n",
       "1  Light-induced indeterminacy alters shade avoid...   \n",
       "2  A psychometric analysis of outcome measures in...   \n",
       "3    COMADRE: a global database of animal demography   \n",
       "4  Optimisation of a treat-to-target approach in ...   \n",
       "5  A selfish genetic element drives recurring sel...   \n",
       "6  Pharmacological treatment of psoriatic arthrit...   \n",
       "7  In wealthier countries, patients perceive wors...   \n",
       "\n",
       "                             docID    source  \n",
       "0                   10.1101/026831  crossref  \n",
       "1                   10.1101/024018  crossref  \n",
       "2  10.1136/annrheumdis-2014-207235  crossref  \n",
       "3                   10.1101/027821  crossref  \n",
       "4  10.1136/annrheumdis-2015-208324  crossref  \n",
       "5                   10.1101/024851  crossref  \n",
       "6  10.1136/annrheumdis-2015-208466  crossref  \n",
       "7  10.1136/annrheumdis-2015-207738  crossref  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print('There are {} documents with contributors who have any of those orcids.'.format(orcid_search.count()))\n",
    "\n",
    "all_agg_df = pd.DataFrame()\n",
    "all_agg_df['title'] = [result.title for result in orcid_results]\n",
    "all_agg_df['docID'] = [result.shareProperties.docID for result in orcid_results]\n",
    "all_agg_df['source'] = [result.shareProperties.source for result in orcid_results]\n",
    "all_agg_df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
